In [186]:
    
from __future__ import print_function
import numpy as np
import pandas as pd
from collections import OrderedDict #sorting participant df dict before pd.concat()
import matplotlib.pylab as plt
from matplotlib import pylab
import matplotlib as mpl
%matplotlib inline
mpl.rcParams['figure.figsize'] = (14,8)
pd.options.display.mpl_style = 'default'
#pd.set_option('display.multi_sparse', True)
from pprint import pprint
from pprint import pformat
#pickled_dbase = "c:/db_pickles/pickle - dbase - 2014-07-28a.pickle"
#dbase = pd.read_pickle(pickled_dbase)
sms_tasknames = ['T1_SMS_5',       'T1_SMS_8',
                 'Ticks_ISO_T2_5', 'Ticks_ISO_T2_8',
                 'Ticks_Linear_5', 'Ticks_Linear_8',
                 'Ticks_Phase_5',  'Ticks_Phase_8',
                 'Jits_ISO_5',     'Jits_ISO_8',
                 'Jits_Linear_5',  'Jits_Linear_8',
                 'Jits_Phase_5',   'Jits_Phase_8', ]
# Participants that are excluded from all performance analysis
pilot_data = ['010', '011', '012', '013', '014',]
non_english_fluent  = ['023', '031', '045', '050', '070', '106',]
left_handed = ['042', '088',]
pro_inst_skill = ['026', '037']
excluded_all_tasks = pilot_data + non_english_fluent + left_handed + pro_inst_skill
    
In [187]:
    
param_all_tasks = lambda v: {task: v for task in sms_tasknames}
sms_params_entry = {
    'stimulus_timing': {
        'T1_SMS_5':       'iso',
        'Ticks_ISO_T2_5': 'iso',
        'Ticks_Linear_5': 'linear',
        'Ticks_Phase_5':  'phase',
        'Jits_ISO_5':     'iso',
        'Jits_Linear_5':  'linear',
        'Jits_Phase_5':   'phase',
        'T1_SMS_8':       'iso',
        'Ticks_ISO_T2_8': 'iso',
        'Ticks_Linear_8': 'linear',
        'Ticks_Phase_8':  'phase',
        'Jits_ISO_8':     'iso',
        'Jits_Linear_8':  'linear',
        'Jits_Phase_8':   'phase',
        },
    'stimulus_style': {
        'T1_SMS_5':       'tick',
        'Ticks_ISO_T2_5': 'tick',
        'Ticks_Linear_5': 'tick',
        'Ticks_Phase_5':  'tick',
        'Jits_ISO_5':     'jitter',
        'Jits_Linear_5':  'jitter',
        'Jits_Phase_5':   'jitter',
        'T1_SMS_8':       'tick',
        'Ticks_ISO_T2_8': 'tick',
        'Ticks_Linear_8': 'tick',
        'Ticks_Phase_8':  'tick',
        'Jits_ISO_8':     'jitter',
        'Jits_Linear_8':  'jitter',
        'Jits_Phase_8':   'jitter',
        },
    'ISI': {
        'T1_SMS_5':       500,
        'Ticks_ISO_T2_5': 500,
        'Ticks_Linear_5': '(varies)',
        'Ticks_Phase_5':  500,
        'Jits_ISO_5':     500,
        'Jits_Linear_5':  '(varies)',
        'Jits_Phase_5':   500,
        'T1_SMS_8':       800,
        'Ticks_ISO_T2_8': 800,
        'Ticks_Linear_8': '(varies)',
        'Ticks_Phase_8':  800,
        'Jits_ISO_8':     800,
        'Jits_Linear_8':  '(varies)',
        'Jits_Phase_8':   800,
        },
    #used in filtering step
    'wait_beats_after_subj_start':                 param_all_tasks(12),
    #used in assigning outlier status / "outlier metric"
    'minimum_percent_deviation_to_keep': param_all_tasks(-35),
    'maximum_percent_deviation_to_keep': param_all_tasks(+20),
    }
#reshape to task>param so parameter lists can be selected by task
sms_params = {task: {param_type: taskparams[task] 
                     for (param_type, taskparams) 
                     in sms_params_entry.items()}
              for task in sms_tasknames}
phase_shift_beats = {800: OrderedDict([
                           (30, -20),
                           (48, +20),
                           (64, +40),
                           (81, -40),
                           (97, -80),
                           (114, +80),
                           (131, +160),
                           (150, -160)]),
                     500: OrderedDict([
                           (64, +20),
                           (81, -20),
                           (97, -50),
                           (114, +50),
                           (131, +100),
                           (150, -100)])
                     }
    
In [188]:
    
short_name = {'T1_SMS_5': 'iso5t1',       
               'T1_SMS_8': 'iso8t1',
               'Ticks_ISO_T2_5': 'iso5t2', 
               'Ticks_ISO_T2_8': 'iso8t2',
               'Ticks_Linear_5': 'lin5t', 
               'Ticks_Linear_8': 'lin8t',
               'Ticks_Phase_5': 'phase5t', 
               'Ticks_Phase_8': 'phase8t',
               'Jits_ISO_5': 'iso5j',    
               'Jits_ISO_8': 'iso8j',
               'Jits_Linear_5': 'lin5j', 
               'Jits_Linear_8': 'lin8j',
               'Jits_Phase_5': 'phase5j',  
               'Jits_Phase_8': 'phase8j',
               }
sms_shortnames = short_name.values()
long_name = {v: k for (k, v) in short_name.items()}
print(sms_shortnames)
    
    
In [189]:
    
def general_task_pid_iterator(label_tasks=True, 
                              label_pids=True, 
                              concise_labels=False,
                              skip_to_task=None,
                              skip_to_pid=None):
    for t in sms_tasknames:
        
        if skip_to_task:
            if t != skip_to_task:
                continue
            else:
                skip_to_task = None
                
        if label_tasks:
            if concise_labels:
                print('\n' + t)
            else:
                print('='*80 + '\n' + t + '\n' + '='*80)
                
        for pid in task_pids[t]:
            
            if skip_to_pid:
                if pid != skip_to_pid:
                    continue
                else:
                    skip_to_pid = None
                    
            if label_pids:
                if concise_labels:
                    print(pid, end=',')
                else:
                    print('-'*60)
                    print('P. ' + pid)
                    
            yield (t, pid)
    
In [190]:
    
import cPickle as pickle
pfile = "c:/db_pickles/pickle - smsbeats - 2014-10-03b.pickle"
with open(pfile) as f:
    task_frames = pickle.load(f)
task_pids = {}
for (k, v) in task_frames.items():
    pids = sorted(set(v.index.get_level_values('pid')))
    task_pids[k] = [p for p in pids if p not in excluded_all_tasks]
    
In [191]:
    
for t in task_frames.keys():
    task_frames[t] = task_frames[t].drop(excluded_all_tasks, level='pid')
    
In [192]:
    
for k, v in task_pids.items():
    print(k, '\t', len(v))
    
    
In [193]:
    
# This function was pulled out from the earlier processing step-- instead
# of deciding what's an outlier while doing the initial data processing,
# we can look at it here, and perhaps experiment with different settings.
# (don't just mindlessly maximize reliability, though-- there could certainly
# be reliable aspects of the data that we still want to remove-- e.g., 
# how many beats a P waits to start, how often they skip a tap...)
def filter_taps(df,
                task_params,
                print_results=False):
    '''
    Input: a DataFrame consisting of the unfiltered list of taps (without targets).
    Output: the dataframe with outlying taps tagged and startup beats removed.
    '''
    # drop initial beats from task recording: [n] beats from start of task
    # or [n] beats from the participant's first tap, whichever comes later
    
    nonfail_beats = df[df.is_failure == False].index.tolist()
    first_played_beat = min(nonfail_beats)
    
    #beatdrop1 = task_params['wait_beats_after_task_start']
    beatdrop2 = first_played_beat + task_params['wait_beats_after_subj_start']
    beats_to_drop_from_start = beatdrop2 #max([beatdrop1, beatdrop2])    
    
    df = df[beats_to_drop_from_start:]   #slice by index name (zero-indexed beats)
    
    # temporarily remove outliers to form a distribution of typical 
    # values, which we'll use to form upper and lower limits for 
    # filtering in the following step.
    # "from left" = the largest negative deviations, 
    # "from right" = the largest positive deviations
    #nworst_left = task_params['stdev_calcs_exclude_n_from_left']
    #nworst_right = task_params['stdev_calcs_exclude_n_from_right']
    #df_adj  = df[(df.dev_perc > max(df.dev_perc.nsmallest(nworst_left))) & 
    #             (df.dev_perc < min(df.dev_perc.nlargest(nworst_right)))]
    
    # Actual filtering of the values based on the temporary distribution
    # created above. (This way, we retain the biggest deviations, as long
    # as they aren't actually outliers.)
    #rem_beyond_stds = task_params['filter_outliers_beyond_x_stdevs']    
    #trimmed_mean = df_adj.dev_perc.mean()
    #trimmed_std =  df_adj.dev_perc.std()
    #upper_limit = trimmed_mean + (trimmed_std * rem_beyond_stds)
    #lower_limit = trimmed_mean - (trimmed_std * rem_beyond_stds)
    
    lower_limit = task_params['minimum_percent_deviation_to_keep'] # -35
    upper_limit = task_params['maximum_percent_deviation_to_keep'] # +20
    
    #df_filt = df[(df.dev_perc <= upper_limit) & 
    #             (df.dev_perc >= lower_limit)]
    
    df['is_outlier'] = False
    df.is_outlier = (  (df.dev_perc > upper_limit) 
                     | (df.dev_perc < lower_limit))
    #devperc_failure = task_params['min_percentISI_deviation_counted_as_failure']
        
    #return df_filt
    return df
    
In [194]:
    
def label_shift_ranges(task_taps_df):
    '''input: a taps-only df for a single task (all participants).
       output: the same df with ranges labeled ('is_range_1a' etc.)
    '''
    
    #label slicing is end-inclusive, so don't overlap beat numbers
    
    # one beat removed from the start of each "a" range (a player isn't
    # expected to be in synch with the actual perturbed tap, but we start
    # measuring when the next one comes.)
    # (the '0' range is before all the large shifts. Just placeholders...)
    
#    shift_ranges = {0: {'a': (  0,  96),
#                        'b': (  0,  96),},
#                    1: {'a': ( 98, 104), 
#                        'b': (105, 113),},
#                    2: {'a': (115, 121),
#                        'b': (122, 130),},
#                    3: {'a': (132, 139),
#                        'b': (140, 149),},
#                    4: {'a': (151, 159), 
#                        'b': (160, 169),},
#                    }
    #shifts: ... 97, 114, 131, 150
    #get the next four beats
    
    shift_ranges = {0: {'a': (  0,  96),
                        'b': (  0,  96),},
                    1: {'a': ( 99, 101), 
                        'b': (102, 113),},
                    2: {'a': (116, 118),
                        'b': (119, 130),},
                    3: {'a': (133, 135),
                        'b': (136, 149),},
                    4: {'a': (152, 154), 
                        'b': (155, 169),},
                    }
    
    def getbeatxs(df):
        # groupby/apply doesn't seem to be set up well for selecting
        # particular rows from each value of a multiindex... here, we'll
        # have to remove the 'pid' index explicitly I guess.
        df = df.reset_index('pid').drop('pid', axis=1)
        
        for i in [0,1,2,3,4]:
            for j in ['a','b']:
                srx = shift_ranges[i][j]
                label = 'is_range_' + str(i) + j
                df[label] = False
                df.loc[srx[0]:srx[1], label] = True
                
        label = 'is_shiftedarea'
        df[label] = False
        df.loc[shift_ranges[1]['a'][0]:shift_ranges[1]['b'][1], label] = True
        df.loc[shift_ranges[2]['a'][0]:shift_ranges[2]['b'][1], label] = True
        df.loc[shift_ranges[3]['a'][0]:shift_ranges[3]['b'][1], label] = True
        df.loc[shift_ranges[4]['a'][0]:shift_ranges[4]['b'][1], label] = True
        return df
    
    g = task_taps_df.groupby(level='pid')
    return g.apply(getbeatxs)
    
In [196]:
    
#taps only (remove "target" data)
db_taps = {t: df.xs('tap', level='stamp')
           for (t, df) in task_frames.items()}
phase_tasks = ['Ticks_Phase_5', 'Ticks_Phase_8',
               'Jits_Phase_5',  'Jits_Phase_8', ]
for t in phase_tasks:
    db_taps[t] = label_shift_ranges(db_taps[t])
    print(t + ": shift range labels")
taps_filtered = OrderedDict()
outlier_rem_record = {}
for t in sms_tasknames:
    
    print('\n' + t)
    tdata = db_taps[t]
    tparams = sms_params[t]
    
    outlier_rem_record[t] = {}
    tdata_filt = {}
    for pid in task_pids[t]:        
        print(pid, end=",")
        pdata = tdata.xs(pid)
        
        #print(max(pdata.index))
        
        #filters out certain intervals and adds "is_outlier" field
        filtered_a = filter_taps(pdata, tparams)
        
        #remove beats based on "is_outlier_ field, as added by filter_taps()
        #but don't remove outliers in phase tasks
        
        if t in phase_tasks:
            filtered_b = filtered_a
        else:
            filtered_b = filtered_a[filtered_a.is_outlier != True]        
        #Need to worry about how to filter the phase tasks later...
        
        tdata_filt[pid] = filtered_b
        
        
        outlier_rem_record[t][pid] = len(filtered_a) - len(filtered_b)
        
        
    taps_filtered[t] = pd.concat(tdata_filt, names=['pid']) 
    
    mean_rem = round(np.mean(outlier_rem_record[t].values()),1)
    std_rem =  round(np.std(outlier_rem_record[t].values()),1)
    max_rem = max(outlier_rem_record[t].values())
    print('\n outlier beats removed per P.: mean={}, sd={}, max={}'
          .format(mean_rem, std_rem, max_rem)) 
    
    print('\n' + '=' * 70)
    
    
In [197]:
    
taps_filtered['Ticks_Phase_5'].tail().T
    
    Out[197]:
In [198]:
    
def fig_dims(width, factor):
    #WIDTH = 350.0  # the number latex spits out
    #FACTOR = 0.45  # the fraction of the width you'd like the figure to occupy
    fig_width_pt  = width * factor
    inches_per_pt = 1.0 / 72.27
    golden_ratio  = (np.sqrt(5) - 1.0) / 2.0  # because it looks good
    fig_width_in  = fig_width_pt * inches_per_pt  # figure width in inches
    fig_height_in = fig_width_in * golden_ratio   # figure height in inches
    fig_dims      = [fig_width_in, fig_height_in] # fig dims as a list
    return fig_dims
#don't adjust MPL defaults to pandas's preferred defaults
pd.options.display.mpl_style = None  
mpl.rcdefaults()
from matplotlib import rcParams
#rcParams['axes.titlesize'] = 22
rcParams['font.size'] = 14
rcParams['xtick.labelsize'] = 12
rcParams['ytick.labelsize'] = 12
rcParams['legend.fontsize'] = 12
rcParams['font.family'] = 'serif'
rcParams['figure.facecolor'] = '1.0' # 0 black --> 1 white; grays
   
def task_hists(tdata):
    figsize = fig_dims(2000, 0.45)    
    ax = avgtargs.plot(y = 'tinterval', linewidth=2, color='black', figsize=figsize)    
    avg_tap = avgtargs.tinterval + avgdevs.dev
    upper_sd = avg_tap + SD_devs.dev
    lower_sd = avg_tap - SD_devs.dev 
    
    #upper_sd.plot(y = 'dev', linewidth=3, color='black', linestyle="--")
    #lower_sd.plot(y = 'dev', linewidth=3, color='black', linestyle="--")    
    #ax.plot(upper_sd.dev, linewidth=3, color='black', linestyle="--")
    #ax.plot(lower_sd.dev, linewidth=3, color='black', linestyle="--")
    
    avg_tap.plot(linewidth=1, color='black', linestyle="--", dashes=(5,3))
    upper_sd.plot(linewidth=1, color='black', linestyle="-", marker="o", markersize=4)
    lower_sd.plot(linewidth=1, color='black', linestyle="-", marker="o", markersize=4)
        
    ax.set_ylabel("Milliseconds")
    ax.set_xlabel("Interval number")    
    ax.grid(b=False, which='major', axis='both')
    
    # set number of labeled "ticks" on each axis (overriding auto setting)
    #ax.xaxis.set_major_locator(mpl.ticker.MaxNLocator(15))
    #ax.yaxis.set_major_locator(mpl.ticker.MaxNLocator(10))
    # (it will sometimes decide to show fewer than this, hence "max")    
    # Or to be precise:
    ax.xaxis.set_major_locator(mpl.ticker.MaxNLocator(15))
    
    #ax.xaxis.tick_bottom()
    #ax.yaxis.tick_left()
    #ax.spines["right"].set_color("none")
    #ax.spines["top"].set_color("none")
    
    ax.legend(["Target stimulus interval (TSI)",
               "TSI + mean of absolute performance asynchronies",
               u"Between-participants variability in absolute performance asynchronies (TSI ± 1 SD)"], loc="best")    
    ax.get_legend().set_title("")
    ax.get_legend().draw_frame(False)
    
    plt.show()
    
    
from matplotlib import rcParams
#rcParams['axes.titlesize'] = 22
rcParams['font.size'] = 14
rcParams['xtick.labelsize'] = 12
rcParams['ytick.labelsize'] = 12
rcParams['legend.fontsize'] = 12
rcParams['font.family'] = 'serif'
rcParams['figure.facecolor'] = '1.0' # 0 black --> 1 white; grays
# iso5t1 and iso8t1: Need to remove the extra intervals at the 
# end of the task from the first few subs! (after beat 130-ish?)
# (Probably easiest and less confusing for future readers if they're just
#  chopped out of the CSV file at the start.)
    
In [199]:
    
#don't adjust MPL defaults to pandas's preferred defaults
pd.options.display.mpl_style = None  
mpl.rcdefaults()
from matplotlib import rcParams
#rcParams['axes.titlesize'] = 22
rcParams['font.size'] = 35
rcParams['xtick.labelsize'] = 16
rcParams['ytick.labelsize'] = 16
rcParams['legend.fontsize'] = 16
rcParams['font.family'] = 'serif'
rcParams['figure.facecolor'] = '1.0' # 0 black --> 1 white; grays
    
In [200]:
    
testdf = db_taps[long_name['phase5t']]
testdf[testdf.is_shiftedarea == True]
    
    Out[200]:
In [201]:
    
tasks_using = sorted([k for k in db_taps.keys() if 'T1_SMS' not in k])
#print(tasks_using)
stack_dev_data = {t: db_taps[t] for t in tasks_using}
for t in tasks_using:
    tdatadf = stack_dev_data[t]
    
    if t in phase_tasks:
        tdatadf = tdatadf[tdatadf.is_shiftedarea == True]
    
    tdata = tdatadf.dev_perc
    
    len_all = tdata.count()
    num_pids = len(tdata.index.get_level_values('pid').unique())
    print(short_name[t])
    print("i = ", len_all)
    print("N = ", num_pids)
    plt.figure()
    tdata.hist(figsize=(5,5), bins=60, color='white', grid=False, range=(-50,50))
    #plt.show()
    #plt.tight_layout() 
    plt.savefig("c:/_Sync/1020_histograms_raw_" + short_name[t] + '2.png',
                format='png',
                )
#n, bins, patches = plt.hist(db_taps[ 30, stacked=True, normed = True)
#plt.figure()
#plt.hist(stack_dev_data, stacked=True)
#plt.show()
    
    
    
    
    
    
    
    
    
    
    
    
    
    
In [203]:
    
#Phase tasks: portions NOT in a post-phase-shift period
phase_tasks = ['Ticks_Phase_5', 'Ticks_Phase_8', 'Jits_Phase_5', 'Jits_Phase_8']
phase_normal_sections = {}
phase_post_shift_sections = {}
phase_post_shift_all = {}
for t in phase_tasks:
    # The only filtering done so far was to remove the 
    # first 12 beats after participant began tapping:
    pdf = taps_filtered[t]
    normal_period = pdf[  (pdf.is_range_0a==True)
                        | (pdf.is_range_1b==True) 
                        | (pdf.is_range_2b==True) 
                        | (pdf.is_range_3b==True) 
                        | (pdf.is_range_4b==True)]
    
    post_shift = pdf[  (pdf.is_range_1a==True)
                     | (pdf.is_range_2a==True) 
                     | (pdf.is_range_3a==True) 
                     | (pdf.is_range_4a==True)]
    
    post_shift_all = pdf[pdf.is_shiftedarea==True]
    
    phase_normal_sections[t] = normal_period
    phase_post_shift_sections[t] = post_shift
    phase_post_shift_all[t] = post_shift_all
    
In [204]:
    
responses_possible = 65. #intervals possible for each phase task (post-shift regions)
response_count = {}
for t in phase_tasks:
    tdata = phase_post_shift_all[t]
    for p in task_pids[t]:
        responsecount = tdata.dev_perc.xs(p).count()
        responsep = responsecount / responses_possible
        response_p_dist.append(responsep)
        if responsep < 0.9:
            print(p, round(responsep,2))
    
    
In [207]:
    
measurement_region_overall_distributions = {'mean': {},
                                            'sd': {},}
print("Before adjustment: \n\n")
for t, df in phase_post_shift_all.items():
    print(t)
    
    subs = df.groupby(level='pid').dev_perc
    mean_of_means = subs.mean().mean()
    mean_of_sds = subs.std().mean()
    count = subs.mean().count()
    
    measurement_region_overall_distributions['mean'][t] = mean_of_means
    measurement_region_overall_distributions['sd'][t] = mean_of_sds
    
    print('mean: %s' % mean_of_means)
    print('sd: %s' % mean_of_sds)
    print('n: %s' % count)
print("\n\nAfter adjustment: \n\n")
for t, df in phase_post_shift_all.items():
    print(t)
    dist_mean = measurement_region_overall_distributions['mean'][t]
    dist_sd = measurement_region_overall_distributions['sd'][t]
    subs = phase_post_shift_all[t].copy()
    
    for p in task_pids[t]:
        devp = subs.xs(p).dev_perc
        upper_limit = 20
        lower_limit = -35
        devp[devp > upper_limit] = upper_limit
        devp[devp < lower_limit] = lower_limit
        
        #missing = len(devp) - devp.count()
        #if missing > 18:
        #    print(p, missing)
    
    #subs_replacena = subs.copy()
    #subs_replacena[subs_replacena.dev_perc.isnull()] = dist_mean + (1 * dist_sd)
    
    taps_filtered[t + "_mperiod_noreplace"] = subs
    taps_filtered[t + "_mperiod_replaced"] = subs
    
    task_pids[t + "_mperiod_noreplace"] = task_pids[t]
    task_pids[t + "_mperiod_replaced"] = task_pids[t]
    
    
    #recalc after adjustments
    print(t)
    devs = subs_replacena.groupby(level='pid').dev_perc
    mean_of_means = devs.mean().mean()
    mean_of_sds = devs.std().mean()
    count = devs.mean().count()
    print('mean: %s' % mean_of_means)
    print('sd: %s' % mean_of_sds)
    print('n: %s' % count)
    
    
In [208]:
    
post_shift_overall_distributions = {'mean': {},
                                    'sd': {},}
print("Before adjustment: \n\n")
for t, df in phase_post_shift_sections.items():
    print(t)
    
    subs = df.groupby(level='pid').dev_perc
    mean_of_means = subs.mean().mean()
    mean_of_sds = subs.std().mean()
    count = subs.mean().count()
    
    post_shift_overall_distributions['mean'][t] = mean_of_means
    post_shift_overall_distributions['sd'][t] = mean_of_sds
    
    print('mean: %s' % mean_of_means)
    print('sd: %s' % mean_of_sds)
    print('n: %s' % count)
print("\n\nAfter adjustment: \n\n")
for t, df in phase_post_shift_sections.items():
    
    dist_mean = post_shift_overall_distributions['mean'][t]
    dist_sd = post_shift_overall_distributions['sd'][t]
    subs = phase_post_shift_sections[t].copy()
    upper_limit = dist_mean + (2 * dist_sd)
    lower_limit = dist_mean - (2 * dist_sd)
    subs[subs.dev_perc > upper_limit] = upper_limit
    subs[subs.dev_perc < lower_limit] = lower_limit
    
    subs_replacena = subs.copy()
    subs_replacena[subs_replacena.dev_perc.isnull()] = dist_mean + (1 * dist_sd)
    taps_filtered[t + "_postshift_noreplace"] = subs
    taps_filtered[t + "_postshift_replaced"] = subs
    
    task_pids[t + "_postshift_noreplace"] = task_pids[t]
    task_pids[t + "_postshift_replaced"] = task_pids[t]
    
    
    #recalc after adjustments
    print(t)
    devs = subs_replacena.groupby(level='pid').dev_perc
    mean_of_means = devs.mean().mean()
    mean_of_sds = devs.std().mean()
    count = devs.mean().count()
    print('mean: %s' % mean_of_means)
    print('sd: %s' % mean_of_sds)
    print('n: %s' % count)
    
    
In [209]:
    
#treat the sections not just after a phase shift like the other tasks...
#t = 'Ticks_Phase_8'
for t in phase_tasks:
    alt_taskname = t + "_normal"
    print(alt_taskname)
    tparams = sms_params[t]
    tdata = phase_normal_sections[t]
    tdata_filt = {}
    outlier_rem_record[alt_taskname] = {}
    for pid in task_pids[t]:
        pdata = tdata.xs(pid)
        filtered_a = filter_taps(pdata, tparams)
        filtered_b = filtered_a[filtered_a.is_outlier != True] 
        tdata_filt[pid] = filtered_b
        outlier_rem_record[t][pid] = len(filtered_a) - len(filtered_b)
    
    taps_filtered[alt_taskname] = pd.concat(tdata_filt, names=['pid']) 
    task_pids[alt_taskname] = task_pids[t]
    mean_rem = round(np.mean(outlier_rem_record[t].values()),1)
    std_rem =  round(np.std(outlier_rem_record[t].values()),1)
    max_rem = max(outlier_rem_record[t].values())
    print('\n outlier beats removed per P.: mean={}, sd={}, max={}'
          .format(mean_rem, std_rem, max_rem)) 
    
    print('\n' + '=' * 70)
    
    
In [210]:
    
taps_filtered.keys()
    
    Out[210]:
In [211]:
    
def sideplots(title, serieslist, namelist, **kwargs):
    
    from matplotlib import pyplot as plt
    
    assert len(serieslist) == len(namelist)
    count = len(serieslist)   
    
    fig, axes = plt.subplots(nrows=count, ncols=3, **kwargs)
    
    colors=['blue', 'green', 'red', 'white', 'orange']
    
    plots = [(namelist[i], serieslist[i]) for i in range(count)]
    
    for (i, (n, s)) in enumerate(plots):
        
        try:
            plot_color = colors[i]
        except IndexError:
            plot_color = 'blue'
            
        ax_hist = plt.subplot2grid((count, 3), (i, 0), colspan=2)
        ax_hist.set_title(n, fontsize=12)
        
        ax_line = plt.subplot2grid((count, 3), (i, 2), colspan=1)
        ax_line.set_title(n, fontsize=12)
        
        s.plot(ax=ax_line, linewidth=3, color=plot_color)
        s.hist(ax=ax_hist, bins=20, color=plot_color)
        
    fig.suptitle(title, fontsize=22)
    plt.show()
    
    #fig.tight_layout()
    
In [212]:
    
short_name
    
    Out[212]:
In [174]:
    
t = long_name['phase5t']
for pid in task_pids[t]:
        
    tdf = phase_post_shift_sections[t].xs(pid)  #(unfiltered)
    
    first_non_miss = tdf[tdf.is_failure==False]
    first_beat_tapped = min(first_non_miss.index)
    
    n = 12
    after_first_n = tdf.ix[first_beat_tapped + n:]    
    missed_beats_count = len(after_first_n[after_first_n.is_failure==True])
    sdf = after_first_n.dev_perc
    
    full_count = len(after_first_n)
    
    filt_df = sdf[(sdf >= -35) & (sdf <= 20)]
    
    filt_count = len(sdf) - len(filt_df)
    good_count = len(filt_df)
    
    pct_retained = 100 * good_count / full_count
    print(str(pct_retained) +  '%')
    if pct_retained < 75: print("\n\n\n!!!!\n")
        
    sideplots(title = "P. {} - misses: {} - filtered out: {} - OK: {}".format(pid, 
                                                                              missed_beats_count, 
                                                                              filt_count,
                                                                              good_count),
              serieslist=[sdf, filt_df], 
              namelist=['raw', 'filtered'],
              figsize=(19,6))
    #if pid == '020': break
    
    
    
    
    
    
    
    
    
    
In [20]:
    
#iso5t2: -30 to 30 looks good
#iso8t2: 
    #p. 49 mostly halfway between beats sometimes...
    #p. 55 consistently tapping halfway between beats
#lin5t:
    #p. 089 switches to half-beat tapping for the last portion of the trial
    #switching to +/- 25%
#lin8t:
    #p. 073 used half-taps
    
#iso5j (jitters):
    #switching to -35 to +20
    
#pid='015'
    
In [151]:
    
t = long_name['iso8j']
for pid in task_pids[t]:
        
    tdf = db_taps[t].xs(pid)  #(unfiltered)
    
    first_non_miss = tdf[tdf.is_failure==False]
    first_beat_tapped = min(first_non_miss.index)
    
    n = 12
    after_first_n = tdf.ix[first_beat_tapped + n:]    
    missed_beats_count = len(after_first_n[after_first_n.is_failure==True])
    sdf = after_first_n.dev_perc
    
    full_count = len(after_first_n)
    
    #filt_df = sdf[(sdf >= -35) & (sdf <= 20)]
    filt_df = taps_filtered[t].xs(pid).dev_perc
    
    filt_count = len(sdf) - len(filt_df)
    good_count = len(filt_df)
    
    pct_retained = 100 * good_count / full_count
    print(str(pct_retained) +  '%')
    if pct_retained < 75: print("\n\n\n!!!!\n")
        
    sideplots(title = "P. {} - misses: {} - filtered out: {} - OK: {}".format(pid, 
                                                                              missed_beats_count, 
                                                                              filt_count,
                                                                              good_count),
              serieslist=[sdf, filt_df], 
              namelist=['raw', 'filtered'],
              figsize=(19,6))
    #if pid == '020': break
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
In [214]:
    
insufficient_data_pids = {}
minimum_prop = 0.70  # must have at least 70% of the the data non-missed
                # and in the keepable range (-35 to 20 percent deviation)
data_proportion = {}
# Three participants (49, 55, 73) have already been removed due to an earlier
# interation of this analysis that showed that they each had fewer than half
# of the tasks meeting the 70% qualifying data criterion.
for t in sms_tasknames:
    print(t)
    
    devs = taps_filtered[t].groupby(level='pid').dev_perc
    
    task_beats_max = devs.count().max()    
    print('Max beats: %s' % task_beats_max)
    
    proportion = devs.count() / task_beats_max
    below_cutoff = proportion[proportion < minimum_prop]
    
    print(below_cutoff)
    
    
In [218]:
    
short_name
    
    Out[218]:
In [219]:
    
#From arduino apparatus code:
#define LINEAR_800_STARTING_ISI     820   //  
#define LINEAR_800_PCHANGE_EVERY    5     // decrease by 10 ms every five intervals (avg. -2ms per interval)
#define LINEAR_800_PCHANGE_AMOUNT   -10
#define LINEAR_500_PCHANGE_EVERY    5     // increase by 10 ms every five intervals (avg. +2ms per interval)
#define LINEAR_500_PCHANGE_AMOUNT   10
#define LINEAR_500_STARTING_ISI     480   //
#tasks go from beat 0 to beat 169: start at 820, end at [820 - (165 * 10 / 5)] = 490
#                                  start at 480, end at [480 + (165 * 10 / 5)] = 810
#Splitting into thirds (by interval count, not time duration!):
linear_tasks = ['Ticks_Linear_5', 'Ticks_Linear_8',
                'Jits_Linear_5',  'Jits_Linear_8']
linear_part_taps = OrderedDict()
linear_part_dfs = OrderedDict()
for t in linear_tasks:
    
    subtask_name_A = t + "ptA"
    subtask_name_B = t + "ptB"
    subtask_name_C = t + "ptC"
    
    task_pids[subtask_name_A] = task_pids[t]
    task_pids[subtask_name_B] = task_pids[t]
    task_pids[subtask_name_C] = task_pids[t]
    
    linear_part_taps[subtask_name_A] = {}
    linear_part_taps[subtask_name_B] = {}
    linear_part_taps[subtask_name_C] = {}
    
    for p in task_pids[t]:
        #ex_task_5 = db_taps[long_name['lin5t']].xs(p)
        #ex_task_8 = db_taps[long_name['lin8t']].xs(p)
        #first_part_800 = ex_task_8[10:65]   # (10 to 64) --> 800ms to 700ms
        #second_part_800 = ex_task_8[65:110] # (65 to 109) --> 690ms to 610ms
        #third_part_800 = ex_task_8[110:165] # (110 to 164) --> 600ms to 500ms
        #first_part_500 = ex_task_5[10:65]   # (10 to 64) --> 500ms to 600ms
        #second_part_500 = ex_task_5[65:110] # (65 to 109) --> 610ms to 690ms
        #third_part_500 = ex_task_5[110:165] # (110 to 164) --> 590ms to 500ms
        
        #but it comes out the same for all tasks:
        taps = taps_filtered[t].xs(p)
        
        first_part  = taps[10:65]
        second_part = taps[65:110]
        third_part  = taps[110:165]
        
        linear_part_taps[subtask_name_A][p] = first_part
        linear_part_taps[subtask_name_B][p] = second_part
        linear_part_taps[subtask_name_C][p] = third_part
    
linear_part_names = linear_part_taps.keys()
# If we want to add this to the DFO, we'll need to translate the dict to a pd.concat set of pids.
#for (far, df) in linear_part_taps.items():
#    taps_filtered[var] = df 
    
#linear_part_taps[t + "ptB"]['110'].dev_perc.plot(figsize=(5,2))
#plt.show()
#---------------------
#sub_tasks = linear_part_taps.keys()
reshaped = {'Tick_Lin_500600': OrderedDict(),
            'Tick_Lin_610690': OrderedDict(),
            'Tick_Lin_700800': OrderedDict(),
            'Jit_Lin_500600': OrderedDict(),
            'Jit_Lin_610690': OrderedDict(),
            'Jit_Lin_700800': OrderedDict(),
            }
lin_tick_both = sorted(set(task_pids['Ticks_Linear_5'])
                       .intersection(task_pids['Ticks_Linear_8']))
lin_jits_both = sorted(set(task_pids['Jits_Linear_5'])
                       .intersection(task_pids['Jits_Linear_8']))
for p in lin_tick_both:
        
    t500600 = pd.concat(axis=0, objs = [linear_part_taps['Ticks_Linear_5ptA'][p],
                                        linear_part_taps['Ticks_Linear_8ptC'][p]])
    
    t610690 = pd.concat(axis=0, objs = [linear_part_taps['Ticks_Linear_5ptB'][p],
                                        linear_part_taps['Ticks_Linear_8ptB'][p]])
    
    t700800 = pd.concat(axis=0, objs = [linear_part_taps['Ticks_Linear_5ptC'][p],
                                        linear_part_taps['Ticks_Linear_8ptA'][p]])
    
    reshaped['Tick_Lin_500600'][p] = t500600
    reshaped['Tick_Lin_610690'][p] = t610690
    reshaped['Tick_Lin_700800'][p] = t700800  
    
for p in lin_jits_both:
    
    j500600 = pd.concat(axis=0, objs = [linear_part_taps['Jits_Linear_5ptA'][p],
                                        linear_part_taps['Jits_Linear_8ptC'][p]])
    
    j610690 = pd.concat(axis=0, objs = [linear_part_taps['Jits_Linear_5ptB'][p],
                                        linear_part_taps['Jits_Linear_8ptB'][p]])
    
    j700800 = pd.concat(axis=0, objs = [linear_part_taps['Jits_Linear_5ptC'][p],
                                        linear_part_taps['Jits_Linear_8ptA'][p]])
    
    reshaped['Jit_Lin_500600'][p] = j500600
    reshaped['Jit_Lin_610690'][p] = j610690
    reshaped['Jit_Lin_700800'][p] = j700800
for t in ['Tick_Lin_500600', 'Tick_Lin_610690', 'Tick_Lin_700800']:
    task_pids[t] = lin_tick_both
    
for t in ['Jit_Lin_500600', 'Jit_Lin_610690', 'Jit_Lin_700800']:
    task_pids[t] = lin_jits_both
   
    
reshaped_dfs = OrderedDict()
for (taskname, task_p_dict) in reshaped.items():
    print(taskname)
    pids_df = pd.concat(axis=0, objs=task_p_dict.values(), keys=task_p_dict.keys(), names=['pid'])
    reshaped_dfs[taskname] = pids_df
    
for (var, df) in reshaped_dfs.items():
    taps_filtered[var] = df 
    
#####    
reshaped_dfs['Tick_Lin_700800'][::1000]
    
    
    Out[219]:
In [68]:
    
taps_filtered['Tick_Lin_610690']
    
    Out[68]:
In [222]:
    
#taps_filtered.keys()
short_name['Jits_Phase_8_postshift_noreplace'] = 'phase8j_psk'  #post shift, kept nans
short_name['Jits_Phase_8_postshift_replaced'] = 'phase8j_psr'   #post shift, replaced nans
short_name['Ticks_Phase_8_postshift_noreplace'] = 'phase8tp_psk'
short_name['Ticks_Phase_8_postshift_replaced'] = 'phase8t_psr'
short_name['Jits_Phase_5_postshift_noreplace'] = 'phase5j_psk'
short_name['Jits_Phase_5_postshift_replaced'] = 'phase5j_psr'
short_name['Ticks_Phase_5_postshift_noreplace'] = 'phase5t_psk'
short_name['Ticks_Phase_5_postshift_replaced'] = 'phase5t_psr'
short_name['Jits_Phase_8_mperiod_noreplace'] = 'phase8j_mpk'  #post shift, kept nans
short_name['Jits_Phase_8_mperiod_replaced'] = 'phase8j_mpr'   #post shift, replaced nans
short_name['Ticks_Phase_8_mperiod_noreplace'] = 'phase8tp_mpk'
short_name['Ticks_Phase_8_mperiod_replaced'] = 'phase8t_mpr'
short_name['Jits_Phase_5_mperiod_noreplace'] = 'phase5j_mpk'
short_name['Jits_Phase_5_mperiod_replaced'] = 'phase5j_mpr'
short_name['Ticks_Phase_5_mperiod_noreplace'] = 'phase5t_mpk'
short_name['Ticks_Phase_5_mperiod_replaced'] = 'phase5t_mpr'
short_name['Ticks_Phase_5_normal'] = 'phase5t_nrm'
short_name['Ticks_Phase_8_normal'] = 'phase8t_nrm'
short_name['Jits_Phase_5_normal'] = 'phase5j_nrm'
short_name['Jits_Phase_8_normal'] = 'phase8j_nrm'
short_name['Tick_Lin_500600'] = 'lint_500600'
short_name['Tick_Lin_610690'] = 'lint_610690'
short_name['Tick_Lin_700800'] = 'lint_700800'
short_name['Jit_Lin_500600'] = 'linj_500600'
short_name['Jit_Lin_610690'] = 'linj_610690'
short_name['Jit_Lin_700800'] = 'linj_700800'
    
In [223]:
    
tasknames = task_frames.keys()
# create a dataframe with an index for every PID that is represented
# in at least one of the tasks' data
pids_all_sms = set()
for (t, df) in task_frames.items():
    pids = df.index.get_level_values('pid').unique()
    pids_not_excl = [p for p in pids if p not in excluded_all_tasks]
    pids_all_sms = pids_all_sms.union(pids_not_excl)
dfo = pd.DataFrame(index = sorted(pids_all_sms))
dfo.index.name = 'pid'
# Iterate through all combinations of tasks, measures, statistics,
# participants. (the valid PIDs are different between tasks, so
# this selects the correct pids. But it might be more elegant to use an
# all-possible-combinations function across all four lists, and handle 
# an exception when a PID isn't represented in a given task's data.)
def stat_combo_gen(tasks, measures, statfuncs, statkwargs):
    
    kwargs_all = {k: {} for k in statfuncs} #default, no kwargs
    
    for k, v in statkwargs.items():
        kwargs_all[k] = v
        
    for task in tasks:
        for measure in measures:
            for statfunc in statfuncs:
                for p in task_pids[task]:
                    yield (task, measure, statfunc, kwargs_all, p)
filt_tasknames = taps_filtered.keys()
#stat_combos = stat_combo_gen(tasks = tasknames, # ['T1_SMS_8'],
stat_combos = stat_combo_gen(tasks = filt_tasknames, # ['T1_SMS_8'],
                             measures = ['dev_perc', 'dev', 'ints'],
                             statfuncs = ['mean', 'std', 'count'],
                             statkwargs = {'std': {'ddof': 1}}
                             )
for (task, measure, statfunc, kwargs, p) in stat_combos:
    #print((task, measure, statfunc, p))
    
    ptaps = taps_filtered[task].xs(p, level='pid')
    mseries = ptaps[measure]
    statfunction = getattr(mseries, statfunc)
    result = statfunction(**kwargs[statfunc])
    
    output_varname = '_'.join([short_name[task], measure, statfunc])
    output_varname = output_varname.replace("dev_perc_mean", "DPm")
    output_varname = output_varname.replace("dev_perc_std", "DPsd")
    output_varname = output_varname.replace("dev_perc_count", "DPct")
    
    if output_varname not in dfo:
        print(output_varname, end=", ")
        dfo[output_varname] = np.nan        
    dfo[output_varname].loc[p] = result
    
    
In [224]:
    
dfo.T
# We need a good way of dealing with missed beats in the post-shift periods.
# We want missed beats to "count against" the participant there.
# So: we'll use an outlier criterion for truncating (not removing) here, and 
# but we'll apply it identically across participants (rather than relative to 
# individual participants' overall variability). When a tap is missed, we
# apply the maximum (truncation) value, so that a missed tap is as bad as the
# worst inaccuracy that's kept on record for participants.
    
    Out[224]:
In [225]:
    
dfo_output_updated = '2014-10-20a'
prefix = "c:/db_pickles/pickle - dfo-sms - "
import cPickle as pickle
output_file= prefix + dfo_output_updated + '.pickle'
pickle.dump(dfo, open(output_file, "wb"))
# Proceed with pickle to Part 5     
    
dfo.head(8).T
    
    Out[225]:
In [239]:
    
#st = 'Ticks_Linear_5ptB'
for st in ['Ticks_Linear_5ptA',
           'Ticks_Linear_5ptB',
           'Ticks_Linear_5ptC', ]#linear_part_names:
    
    print('\n\n %s \n===================\n' % st)
    task_taps = linear_part_taps[st]
    for p in task_pids[st]:
        print(p)
        taps = task_taps[p].dev_perc
        print('\t unfiltered')
        taps.hist(figsize=(5,1.2), color='blue')
        plt.show() #remove this to plot both in the same figure
        print('\t filtered')
        filt = taps[(taps <= 20) & (taps >= -35)]
        try:
            filt.hist(figsize=(5,1.2), color='green')
            plt.show()
        except ValueError:
            print("ZERO SIZE, CANNOT PLOT")
        #if p=="025": break
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
In [113]:
    
#NO LONGER USING THIS-- SECTIONS ARE COLLAPSED TOGETHER ABOVE
# First pass: determine the truncation value for each task. What's
# z = 2.97 for all taps across all participants within a given
# section of a given task?
phase_tasks = ['Ticks_Phase_5', 'Ticks_Phase_8',
               'Jits_Phase_5',  'Jits_Phase_8', ]
psection_list = ['0a', #0b isn't a separate section
                 '1a', '1b', '2a', '2b', 
                 '3a', '3b', '4a', '4b']
trunc_value = {t: {} for t in phase_tasks}
reindexed_sections = {t: {} for t in phase_tasks}
reindexed_stacked = {}
for t in phase_tasks:
    tdata = taps_filtered[t]
    #not splitting by participant-- calc across all p's
    for section in psection_list:
        #print(section)
        section_ident_column = 'is_range_' + section
        section_taps = tdata[tdata[section_ident_column]==True]
        #print(section_taps.head())
        section_dps = section_taps.dev_perc
        #print(section_dps.count())
        #print(section_dps.std())
        section_mean = section_dps.mean()
        section_sd = section_dps.std()
        
        trunc = {'upper': section_mean + 2.97 * section_sd,
                 'lower': section_mean - 2.97 * section_sd}
        
        trunc_value[t][section] = trunc
        print("{}, {}: <{}, >{}".format(t, section, 
                                  trunc['lower'], trunc['upper']))
        
        #fill in all beat values (they were skipped when we built
        # this dataframe, so there aren't any NaN values in place yet)
        sec_start = section_dps.index.get_level_values('beat').min()
        sec_end = section_dps.index.get_level_values('beat').max()
        
        dps = {pid: section_dps.xs(pid) for pid in task_pids[t]}
        p_rows = pd.DataFrame(dps)       
        p_cols = p_rows.T
        dps_reindexed = p_cols.reindex(columns=range(sec_start,sec_end+1))
        dps_reindexed.index.name = 'pid'
        
        reindexed_sections[t][section] = dps_reindexed
        
        
        #Not actually doing this yet: just looking at what happens to the values
        # in each task based on the trunc values we just found.
        stacked = dps_reindexed.stack(dropna=False)        
        print('trunc <: %s' % stacked[stacked < trunc['lower']].count())
        print('trunc >: %s' % stacked[stacked > trunc['upper']].count())
        print('blank: %s' % len(stacked[stacked.isnull()])) #count() excludes NaNs!
        
        #print(section.isnull().count())
    
    
In [ ]:
    
phase_tasks = ['Ticks_Phase_5', 'Ticks_Phase_8',
               'Jits_Phase_5',  'Jits_Phase_8', ]
psection_list = ['0a', #0b isn't a separate section
                 '1a', '1b', '2a', '2b', 
                 '3a', '3b', '4a', '4b']
for task in phase_tasks:
    
    print(task)
    task_sections = reindexed_sections[task]
    for section in psection_list:
        print(section)
        truncs = trunc_value[task][section]
        for pid in task_pids[task]:
            print(pid, end=':')
            ptaps = task_sections[section].loc[pid]
            
            # only measure we're using is dev_perc now, since that's 
            
            #Truncate outliers and impute missing beats with truncation value
            # (randomly select upper or lower truncation value, I guess?)
            
            def trunc_and_replace_nans(i):
                
                #def pick_trunc_side():
                #    import random 
                #    r = random.randint(0, 1)
                #    if r==1:
                #        return truncs['upper']
                #    else:
                #        return truncs['lower']
                
                if i > truncs['upper']: 
                    i = truncs['upper']
                elif i < truncs['lower']:
                    i = truncs['lower']
                elif np.isnan(i): 
                    i = truncs[prev_trunc] 
                    #avoid randomization.... was pick_trunc_side()
                    
                return i
                
            stats_before = (ptaps.max(), ptaps.min(), ptaps.mean())
            prev_trunc = 'upper' #default
            
            ptaps_post = ptaps.apply(trunc_and_replace_nans)
            stats_after = (ptaps_post.max(), ptaps_post.min(), ptaps_post.mean())
            if stats_before==stats_after:
                print("no change: {:f} {:f} {:f}"
                      .format(stats_before[0], stats_before[1], stats_before[2]))
            else:
                print("changed. before: {:f} {:f} {:f}, after: {:f} {:f} {:f}"
                      .format(stats_before[0], stats_before[1], stats_before[2],
                              stats_after[0], stats_after[1], stats_after[2]))
            result_m = ptaps_post.mean()
            result_sd = ptaps_post.std()
            result_ct_pre = ptaps.count()  #AFTER replacements!
            result_ct_post = ptaps_post.count()  #AFTER replacements!
            
            task_sec_name = '_'.join([short_name[task], 's' + section])
            output_varname_m = task_sec_name + '_DPm'
            output_varname_sd = task_sec_name + '_DPsd'
            output_varname_ct_pre = task_sec_name + '_DPctPre'
            output_varname_ct_post = task_sec_name + '_DPctPost'
            
            if output_varname_m not in dfo:
                dfo[output_varname_m] = np.nan
                print('added varname: %s' % output_varname_m)
            if output_varname_sd not in dfo:
                dfo[output_varname_sd] = np.nan
                print('added varname: %s' % output_varname_sd)
            if output_varname_ct_pre not in dfo:
                dfo[output_varname_ct_pre] = np.nan
                print('added varname: %s' % output_varname_ct_pre)
            if output_varname_ct_post not in dfo:
                dfo[output_varname_ct_post] = np.nan
                print('added varname: %s' % output_varname_ct_post)
                
            dfo[output_varname_m].loc[pid] = result_m
            dfo[output_varname_sd].loc[pid] = result_sd
            dfo[output_varname_ct_pre].loc[pid] = output_varname_ct_pre
            dfo[output_varname_ct_post].loc[pid] = output_varname_ct_post
            
            print()            
        print()
    print()
#change output: max, min, mean
    
In [116]:
    
dfo.T
    
    Out[116]:
In [227]:
    
#sms_tasknames = ['Ticks_Linear_5',
#                 'Ticks_Linear_8',
#                 'Ticks_Phase_5',
#                 'Ticks_Phase_8',
#                 'T1_SMS_5',
#                 'Ticks_ISO_T2_5',
#                 'T1_SMS_8',
#                 'Ticks_ISO_T2_8',]
def cool_sms_plot(participant_task_df, ISI): 
    
    from pandas.stats.moments import rolling_mean
    
    lb_test = participant_task_df.xs('target', level='stamp')
    tap_test = participant_task_df.xs('tap', level='stamp')   
    
    lb_test.plot(x = 'task_ms', y = 'tinterval', figsize=(13,7), linewidth=4)
    
    tap_test['dev_relative'] = tap_test.dev + ISI
    tap_test.plot(x = 'task_ms', y = 'dev_relative', marker="o", linewidth=0) #, figsize=(14,9))
    
    tap_test['rmean'] = rolling_mean(tap_test.dev_relative, window=10, center=True, min_periods=2)
    tap_test.plot(x = 'task_ms', y = 'rmean', linewidth=2)
    
    #print(tap_test.dev)
    #need to do this in MPL directly so I can return a plot variable
#cool_sms_plot(sms_tasknames[0], '102')
#cool_sms_plot('Ticks_ISO_T2_5', '103')
#could use this in manuscript to demonstrate pre- and post-filtered data...
task = long_name['phase8j']
pid = '116'
print(pid)
cool_sms_plot(task_frames[task].xs(pid), 800)
    
    
    
In [229]:
    
def fig_dims(width, factor):
    #WIDTH = 350.0  # the number latex spits out
    #FACTOR = 0.45  # the fraction of the width you'd like the figure to occupy
    fig_width_pt  = width * factor
    inches_per_pt = 1.0 / 72.27
    golden_ratio  = (np.sqrt(5) - 1.0) / 2.0  # because it looks good
    fig_width_in  = fig_width_pt * inches_per_pt  # figure width in inches
    fig_height_in = fig_width_in * golden_ratio   # figure height in inches
    fig_dims      = [fig_width_in, fig_height_in] # fig dims as a list
    return fig_dims
def task_variability(taskname):
    
    #tdata = db_taps[taskname]
    tdata = taps_filtered[taskname]    
    #plt.suptitle(short_name[taskname])
    ISI = sms_params[taskname]['ISI']
    if ISI == '(varies)': ISI = 650
        
    avgdevs = tdata.mean(level='beat')[5:]
    SD_devs = tdata.std(level='beat')[5:]
    avgtargs = task_frames[taskname].xs('target',level='stamp').mean(level='beat')
    figsize = fig_dims(2000, 0.45)    
    ax = avgtargs.plot(y = 'tinterval', linewidth=2, color='black', figsize=figsize)        
    #ax.plot(avgdevs)
        
    adjust_avgdevs = avgdevs + ISI
    #adjust_avgdevs.plot(y = 'dev', linewidth=2)    
    #upper_sd = adjust_avgdevs + (SD_devs)
    #lower_sd = adjust_avgdevs - (SD_devs)
    #plt.fill_between(x='task_ms', y1=upper_sd.dev, y2=lower_sd.dev, color='grey', alpha='0.5')
    #upper_sd.plot(y = 'dev', linewidth=1)
    #lower_sd.plot(y = 'dev', linewidth=1)
    #upper_sd = ISI + SD_devs
    #lower_sd = ISI - SD_devs
    
    avg_tap = avgtargs.tinterval + avgdevs.dev
    upper_sd = avg_tap + SD_devs.dev
    lower_sd = avg_tap - SD_devs.dev
        
    
    #upper_sd.plot(y = 'dev', linewidth=3, color='black', linestyle="--")
    #lower_sd.plot(y = 'dev', linewidth=3, color='black', linestyle="--")
    
    #ax.plot(upper_sd.dev, linewidth=3, color='black', linestyle="--")
    #ax.plot(lower_sd.dev, linewidth=3, color='black', linestyle="--")
    
    avg_tap.plot(linewidth=1, color='black', linestyle="--", dashes=(5,3))
    upper_sd.plot(linewidth=1, color='black', linestyle="-", marker="o", markersize=4)
    lower_sd.plot(linewidth=1, color='black', linestyle="-", marker="o", markersize=4)
        
    ax.set_ylabel("Milliseconds")
    ax.set_xlabel("Interval number")
    
    ax.grid(b=False, which='major', axis='both')
    
    # set number of labeled "ticks" on each axis (overriding auto setting)
    ax.xaxis.set_major_locator(mpl.ticker.MaxNLocator(15))
    ax.yaxis.set_major_locator(mpl.ticker.MaxNLocator(10))
    # (it will sometimes decide to show fewer than this, hence "max")
    
    # Or to be precise:
    ax.xaxis.set_major_locator(mpl.ticker.MaxNLocator(15))
    
    #ax.xaxis.tick_bottom()
    #ax.yaxis.tick_left()
    #ax.spines["right"].set_color("none")
    #ax.spines["top"].set_color("none")
    
    ax.legend(["Target IOI",
               "IOI + mean of absolute asynchrony values",
               u"Between-participants variability in mean asynchrony (IOI ± 1 SD)"], loc="best")    
    ax.get_legend().set_title("")
    ax.get_legend().draw_frame(False)
    plt.savefig("c:/_Sync/1020a_postfilt_varlines_" + short_name[t] + '.png',
                format='png',
                )
    plt.show()
#don't adjust MPL defaults to pandas's preferred defaults
pd.options.display.mpl_style = None  
mpl.rcdefaults()
from matplotlib import rcParams
#rcParams['axes.titlesize'] = 22
rcParams['font.size'] = 14
rcParams['xtick.labelsize'] = 12
rcParams['ytick.labelsize'] = 12
rcParams['legend.fontsize'] = 12
rcParams['font.family'] = 'serif'
rcParams['figure.facecolor'] = '1.0' # 0 black --> 1 white; grays
for t in sms_tasknames:
    print(short_name[t])
    print("N = ", len(db_taps[t].index.get_level_values('pid').unique()))
    task_variability(t) #long_name['iso5t1'])
    #break
     
    
# iso5t1 and iso8t1: Need to remove the extra intervals at the 
# end of the task from the first few subs! (after beat 130-ish?)
# (Probably easiest and less confusing for future readers if they're just
#  chopped out of the CSV file at the start.)
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
In [31]:
    
mpl.rcdefaults()
rcParams #.keys()
#avgdevs = db_taps[long_name['phase8t']].mean(level='beat').dev
    
    Out[31]:
In [102]:
    
def plot_settings():
    #don't adjust MPL defaults to pandas's preferred defaults
    pd.options.display.mpl_style = None  
    mpl.rcdefaults()
    from matplotlib import rcParams
    #rcParams['axes.titlesize'] = 22
    rcParams['font.size'] = 14
    rcParams['xtick.labelsize'] = 12
    rcParams['ytick.labelsize'] = 12
    rcParams['legend.fontsize'] = 12
    rcParams['font.family'] = 'serif'
    rcParams['figure.facecolor'] = '1.0' # 0 black --> 1 white; grays
plot_settings()
phase5t = db_taps[long_name['phase5t']]
phase8t = db_taps[long_name['phase5t']]
avgdevs = {5: phase5t.mean(level='beat')[5:],
           8: phase8t.mean(level='beat')[5:], }
sd_devs = {5: phase5t.std(level='beat')[5:],
           8: phase8t.std(level='beat')[5:], }
sd_devs[5].dev_perc.plot(color='red')
for shift_beat in [97, 114, 131, 150]:
    plt.axvline(x=shift_beat, color='black', ymin=0, ymax=1.0, linewidth=1)
    
    
In [24]:
    
import re
def col_find(df, regex):    
    cols = list(enumerate(df.columns))
    matches = [#'%d. %s' % 
               (i, c) 
                     for (i, c) in cols
                     #if filt in c
                     if re.findall(regex, c)
                     ]
    #print('\n'.join(matches))
    return matches
#filt = r"(^J)(.*)(d$)"
#cf = col_find(dfo, filt)
#import itertools
#list(itertools.combinations(cf, 2))
    
In [25]:
    
def inverse_scatter(dfo, ilocx, ilocy, *args, **kwargs):    
    inversed = lambda df: 1.0/df
    df_temp = pd.concat([inversed(dfo.T.iloc[ilocx]),
                         inversed(dfo.T.iloc[ilocy])],
                         axis=1)
    df_temp.plot(x=0,y=1, kind='scatter', **kwargs)
    plt.show()
    print('r = %f' % df_temp.corr().iloc[0,1])
def inverse_correl(dfo, ilocx, ilocy, **kwargs):
    inversed = lambda df: 1.0/df
    df_temp = pd.concat([inversed(dfo.T.iloc[ilocx]),
                         inversed(dfo.T.iloc[ilocy])],
                         axis=1)
    print('r = %f' % df_temp.corr().iloc[0,1])
    
    
inverse_scatter(dfo, 73, 79, figsize=(5,5))
    
    
    
In [26]:
    
#NEW DATA VERSION....
percstds = col_find(dfo, r'.*perc_std')
import itertools
pairs = list(itertools.combinations(percstds, 2))
pair_nums = [(x[0], y[0]) for x, y in pairs]
for (x, y) in pair_nums:
    inverse_scatter(dfo, x, y, figsize=(5, 5))
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
In [29]:
    
# original scale plots for comparison
def scatter_by_colnum(dfo, ilocx, ilocy, *args, **kwargs):    
    #inversed = lambda df: 1.0/df
    df_temp = pd.concat([dfo.T.iloc[ilocx],
                         dfo.T.iloc[ilocy]],
                         axis=1)
    df_temp.plot(x=0,y=1, kind='scatter', **kwargs)
    plt.show()
    print('r = %f' % df_temp.corr().iloc[0,1])
    
import itertools
pairs = list(itertools.combinations(percstds, 2))
pair_nums = [(x[0], y[0]) for x, y in pairs]
for (x, y) in pair_nums:
    scatter_by_colnum(dfo, x, y, figsize=(5, 5))
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
In [26]:
    
def sideplots(series_top, series_bottom, 
              plotname_top="pre-filter",
              plotname_bottom="post-filter"):
    from matplotlib import pyplot as plt
    fig, axes = plt.subplots(nrows=2, ncols=3)
    
    fig.set_figheight(7)
    fig.set_figwidth(15)
    
    #fig.suptitle('t', fontsize=25)
    #plt.xlabel('xlabel', fontsize=18)
    #plt.ylabel('ylabel', fontsize=16)
    
    ax1 = plt.subplot2grid((2,3), (0,0), colspan=2)
    ax2 = plt.subplot2grid((2,3), (1,0), colspan=2)
    
    ax3 = plt.subplot2grid((2,3), (0, 2)) #, rowspan=2)
    ax4 = plt.subplot2grid((2,3), (1, 2))
    #ax5 = plt.subplot2grid((4,4), (2, 1))
    
    ax1.set_title(plotname_top, fontsize=16)
    ax2.set_title(plotname_bottom, fontsize=16)
    ax3.set_title(plotname_top, fontsize=16)
    ax4.set_title(plotname_bottom, fontsize=16)
    #ax5.set_title('ax5 title', fontsize=35)
#    series_l.plot(ax=axes[0,0], linewidth=3)
#    series_r.plot(ax=axes[0,1], linewidth=3)
#    series_l.hist(ax=axes[1,0])
#    series_r.hist(ax=axes[1,1])
    series_top.plot(ax=ax1, linewidth=3)
    series_bottom.plot(ax=ax2, linewidth=3)
    series_top.hist(ax=ax3, bins=20)
    series_bottom.hist(ax=ax4, bins=20)
    fig.tight_layout()
    
In [27]:
    
#not using this at the moment
test_params = {'ISI': 500,
               'filter_outliers_beyond_x_stdevs': 3,
               #'min_percentISI_deviation_counted_as_failure': 40,
               'stdev_calcs_exclude_n_from_left': 2,
               'stdev_calcs_exclude_n_from_right': 2,
               #'stimulus_style': 'tick',
               #'stimulus_timing': 'iso',
               'wait_beats_after_subj_start': 6,
               'wait_beats_after_task_start': 9, }
#test_task = long_name['iso5t2'] #'Ticks_ISO_T2_5'
#test_pids = ['101', '102', '103', '104', '105', '107']
def before_after_plots(taskname, pid):
    unfilt = db_taps[taskname].xs(pid)
    filtered = filter_taps(unfilt, task_params=sms_params[taskname])
    print("{0} taps ==> {1} taps".format(len(unfilt), len(filtered)))
    #fig = sideplots(test_taps.dev_perc, filtered.dev_perc)
    outliers_removed = filtered[filtered.is_outlier != True]
    print('pre-filter:\t' +
          'sd= {sd} \t mean= {mean} \t md= {md}'.format(sd=unfilt.dev_perc.std(),
                                                        mean=unfilt.dev_perc.mean(),
                                                        md=unfilt.dev_perc.median()))
    print('post-filter:\t' +
          'sd= {sd} \t mean= {mean} \t md= {md}'.format(sd=filtered.dev_perc.std(),
                                                        mean=filtered.dev_perc.mean(),
                                                        md=filtered.dev_perc.median()))    
    fig = sideplots(unfilt.dev_perc, 
                    outliers_removed.dev_perc,
                    plotname_top = "%s, P. %s, pre-filter" % (short_name[taskname], pid),
                    plotname_bottom = "P. {}, post-filter".format(pid))
    #plt.show()
    return fig
    
            
def too_many_plots(**kwargs):
    gen_tasks_pids = general_task_pid_iterator(**kwargs)
    for t, pid in gen_tasks_pids:
        #plt.figure()
        fig = before_after_plots(t, pid)
        #plt.show()
        yield fig
   
next_plot = too_many_plots()
for i in range(2):
#    plt.figure()
#    fig = next_plot.next()
    plt.show(next_plot.next())
#skip_to = ('T1_SMS_8', '115')
#start_later = too_many_plots(skip_to_task='T1_SMS_8', skip_to_pid='080')
#
#for i in range(2):
#    plt.figure()
#    fig = start_later.next()
#    plt.show()
    
    
    
    
    
In [24]:
    
from matplotlib.backends.backend_pdf import PdfPages
pp = PdfPages('C:/db_pickles/multipage-big.pdf')
tpid_plots = too_many_plots()
#for i in tpid_plots:
    plotgrid = next_plot.next()
    pp.savefig(plotgrid)
    plt.close() #prevents output from displaying to user
else:
    pp.close()
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
In [40]:
    
import psutil
free_megs = psutil.virtual_memory()[1] / 1000000
free_megs
    
    Out[40]:
In [ ]:
    
#separate files for tasks
from matplotlib.backends.backend_pdf import PdfPages
iterator = general_task_pid_iterator(concise_labels=False, 
                                     skip_to_task='Ticks_Phase_8')
prev_t = None
pp = PdfPages('C:/db_pickles/empty.pdf')
              
while True:
    try:
        t, pid = iterator.next()
    except StopIteration:
        pp.close()
        break
        
    if prev_t != t:
        pp.close()
        pp = PdfPages('C:/db_pickles/sideplots - 2014-09-23b2 - %s.pdf' % short_name[t])
    print("{} megs free memory".format(psutil.virtual_memory()[1] / 1000000))
    fig = before_after_plots(t, pid)
    pp.savefig(fig)
    plt.close()
    prev_t = t
pp.close()
    
In [37]:
    
def task_side_plotter_pdf(task_name):
    
    from matplotlib.backends.backend_pdf import PdfPages
    
    t = task_name
    file_out = 'C:/db_pickles/sideplots - 2014-09-26c1 - %s.pdf' % short_name[t]
        
    with PdfPages(file_out) as pp:
        for pid in task_pids[task_name]:
            print("{} megs free memory".format(psutil.virtual_memory()[1] / 1000000))
            fig = before_after_plots(t, pid)
            pp.savefig(fig)
            plt.close()
        pp.close()
for t in sms_tasknames:
    task_side_plotter_pdf(t)
    
    
    
In [38]:
    
for i in general_task_pid_iterator(concise_labels=True):
    pass
    
    
In [260]:
    
# OUTLIER LABELING - removed for now
    adjusted_devperc_mean = rem_worst.mean()    
    taps['outlier_metric'] = taps.dev_perc / adjusted_devperc_mean
    
    devperc_limit = rem_worst.std() * rem_beyond_stds    
    taps['is_outlier'] = (  (taps.dev_perc - adjusted_devperc_mean > devperc_limit) 
                          | (taps.dev_perc - adjusted_devperc_mean < -1 * devperc_limit) )
    
    
    
    if print_results:
        print('worst deviations (% of ISI):')
        print(list(temp_devperc_ordered[:nworst_left]))
        print(list(temp_devperc_ordered[-1 * nworst_right:]))
        print('original mean: %s' % round(taps.dev_perc.mean(), 1))
        print('adjusted mean: %s' % round(adjusted_devperc_mean), 1)
        print('original stdv: %s' % round(taps.dev_perc.std(), 1))
        print('adjusted stdv: %s' % round(rem_worst.std(), 1))
        outliers = taps[taps.is_outlier]
        if len(outliers) > 0:
            print('outlier deviations (% of ISI):')
            print(list(outliers.dev_perc.round(decimals=2)))
        else:
            print('No outliers.')
    
    taps.set_index('beat', inplace=True)
    return taps
    
    
In [3]:
    
#filtering would have to take place here, since we aren't assigning "outliers" 
#in the earlier processing steps anymore
db_taps_filt = {t: df[df.is_outlier==False]
                for (t, df) in db_taps.items()}
    
In [4]:
    
db_taps_filt['T1_SMS_5']
    
    Out[4]:
In [195]:
    
task_frames.keys()
    
    Out[195]:
In [196]:
    
# Goofing off
def list_summary(ls, head=5, tail=5):
    lhead = list(ls[:head]) 
    ltail = list(ls[-tail:])
    return ' '.join(lhead + ['...'] + ltail)
def tabbed_dict(d):
    maxlength = max([len(h) for h in d.keys()])
    set_tabs = 1 + maxlength//8
    outd = {}
    for k, v in d.items():
        add_tabs = set_tabs - len(k)//8
        outk = k + '\t' * add_tabs
        outd[outk] = v
    return outd
for k, v in tabbed_dict(task_pids).items():
    print(k + list_summary(v))
    
    
In [197]:
    
def tapxs(df):
    return df.xs('tap', level='stamp')
def tapxsp(df, pid):
    return (df.xs(pid,   level='pid')
              .xs('tap', level='stamp'))
def db_ptap(task, pid):
    return (task_frames[task].xs(pid, level='pid')
                             .xs('tap', level='stamp'))
    
In [199]:
    
df = task_frames[t].xs('098') #.xs('tap', level='stamp')
df[-30:] #.sort().iloc[-30:]
    
    Out[199]:
In [206]:
    
import re
def col_find(df, regex):    
    cols = list(enumerate(df.columns))
    matches = [#'%d. %s' % 
               (i, c) 
                     for (i, c) in cols
                     #if filt in c
                     if re.findall(regex, c)
                     ]
    #print('\n'.join(matches))
    return matches
filt = r"(^J)(.*)(d$)"
cf = col_find(dfo, filt)
import itertools
list(itertools.combinations(cf, 2))
    
    Out[206]:
In [213]:
    
#df_X = dfo.T.iloc[49]  #'Jits_ISO_5_dev_perc_std'
#df_Y = dfo.T.iloc[55]  #'Jits_ISO_8_dev_perc_std'
def inverse_scatter(dfo, ilocx, ilocy, *args, **kwargs):    
    inversed = lambda df: 1.0/df
    df_temp = pd.concat([inversed(dfo.T.iloc[ilocx]),
                         inversed(dfo.T.iloc[ilocy])],
                         axis=1)
    df_temp.plot(x=0,y=1, kind='scatter', **kwargs)
    plt.show()
    print('r = %f' % df_temp.corr().iloc[0,1])
def inverse_correl(dfo, ilocx, ilocy, **kwargs):
    inversed = lambda df: 1.0/df
    df_temp = pd.concat([inversed(dfo.T.iloc[ilocx]),
                         inversed(dfo.T.iloc[ilocy])],
                         axis=1)
    print('r = %f' % df_temp.corr().iloc[0,1])
    
inverse_scatter(dfo, 73, 79, figsize=(5,5))